{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1_WebScraping_President_Speech"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Step by Step for 1 page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from io import StringIO\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup as bs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def page_scrapping(page_no):\n",
    "    \"\"\"Fetch one page of the speech index and return it as a DataFrame.\n",
    "\n",
    "    Steps:\n",
    "    1) build the query string for the requested page number\n",
    "    2) POST it to the index URL with requests.post()\n",
    "    3) parse the response HTML with BeautifulSoup\n",
    "    4) read the first table of the page with pandas\n",
    "    5) collect the href of every 'a' tag matched in the subject cells\n",
    "    6) prefix each href with the index URL\n",
    "    7) store the links in the '내용링크' column of the data frame\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    page_no : int\n",
    "        Value sent as the pageIndex query parameter.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        First table found on the page plus a '내용링크' column of detail links.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.HTTPError\n",
    "        If the server answers with an error status.\n",
    "    ValueError\n",
    "        If the number of links differs from the number of table rows.\n",
    "    \"\"\"\n",
    "    url = \"https://www.pa.go.kr/research/contents/speech/index.jsp\"\n",
    "    params = f\"spMode=&artid=&catid=&pageIndex={page_no}&searchHistoryCount=0&searchStartDate=&searchEndDate=&pageUnit=20\"\n",
    "\n",
    "    # NOTE(review): verify=False disables TLS certificate verification -- confirm it is still required.\n",
    "    # The timeout keeps a stalled connection from blocking the notebook forever.\n",
    "    response = requests.post(url, params=params, verify=False, timeout=30)\n",
    "    response.raise_for_status()\n",
    "\n",
    "    # Name the parser explicitly so the parse tree does not depend on which parsers are installed.\n",
    "    html = bs(response.text, 'html.parser')\n",
    "\n",
    "    # Wrap the markup in StringIO: newer pandas releases deprecate passing a literal HTML string.\n",
    "    df = pd.read_html(StringIO(response.text))[0]\n",
    "\n",
    "    a_list = html.select(\"#M_More > tr > td.subject > a\")\n",
    "    a_href = [a['href'] for a in a_list]\n",
    "\n",
    "    # Each href is appended to the index URL to form the detail-page link.\n",
    "    df[\"내용링크\"] = [url + href for href in a_href]\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_content(url):\n",
    "    \"\"\"Download one speech detail page and return the text of its content cell.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the detail page to request.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Text of the first element matching the td.content selector, or the\n",
    "        placeholder '음성 기록 또는 동영상 기록' when nothing matches.\n",
    "    \"\"\"\n",
    "    # The timeout keeps a stalled connection from blocking the whole run.\n",
    "    response = requests.post(url, verify=False, timeout=30)\n",
    "    html = bs(response.text, 'html.parser')\n",
    "    content = html.select(\"#content > div > table > tbody > tr > td.content\")\n",
    "\n",
    "    # Pause after every request; this must run before any return statement to take effect.\n",
    "    time.sleep(0.1)\n",
    "\n",
    "    if len(content) == 0:\n",
    "        return '음성 기록 또는 동영상 기록'\n",
    "    return content[0].text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape index page 1 and preview the first 10 rows of the resulting table.\n",
    "df = page_scrapping(1)\n",
    "display(df.head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm.notebook import tqdm\n",
    "# Register tqdm's progress_* methods on pandas objects so progress_map is available.\n",
    "tqdm.pandas()\n",
    "\n",
    "# Request every link in '내용링크' and store the returned text in a new '내용' column.\n",
    "df['내용'] = df['내용링크'].progress_map(get_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the link column, then write the table to an Excel workbook.\n",
    "df = df.drop('내용링크', axis=1)\n",
    "# `encoding` is not passed: to_excel deprecated it in pandas 1.5 and rejects it from 2.0 on.\n",
    "df.to_excel('연설문내용포함.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Scraping from page 1 to 450"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup as bs\n",
    "import pandas as pd\n",
    "from tqdm.notebook import tqdm\n",
    "import time\n",
    "\n",
    "def page_scrapping(page_no):\n",
    "    \"\"\"Fetch one page of the speech index and return it as a DataFrame.\n",
    "\n",
    "    Steps:\n",
    "    1) build the form fields for the requested page number\n",
    "    2) POST them to the index URL with requests.post()\n",
    "    3) parse the response HTML with BeautifulSoup\n",
    "    4) read the first table of the page with pandas\n",
    "    5) collect the href of every 'a' tag matched in the subject cells\n",
    "    6) prefix each href with the index URL\n",
    "    7) store the links in the '내용링크' column of the data frame\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    page_no : int\n",
    "        Value sent as the pageIndex form field.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        First table found on the page plus a '내용링크' column of detail links.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.HTTPError\n",
    "        If the server answers with an error status.\n",
    "    ValueError\n",
    "        If the number of links differs from the number of table rows.\n",
    "    \"\"\"\n",
    "    # Imported here so this cell stays runnable on its own.\n",
    "    from io import StringIO\n",
    "\n",
    "    url = \"https://www.pa.go.kr/research/contents/speech/index.jsp\"\n",
    "    params = {\n",
    "        'spMode': '',\n",
    "        'artid': '',\n",
    "        'catid': '',\n",
    "        'pageIndex': page_no,\n",
    "        'searchHistoryCount': 0,\n",
    "        'searchStartDate': '',\n",
    "        'searchEndDate': '',\n",
    "        'pageUnit': 20\n",
    "    }\n",
    "\n",
    "    # NOTE(review): verify=False disables TLS certificate verification -- confirm it is still required.\n",
    "    # The timeout keeps a stalled connection from blocking the scraping loop forever.\n",
    "    response = requests.post(url, data=params, verify=False, timeout=30)\n",
    "    response.raise_for_status()\n",
    "    html = bs(response.text, 'html.parser')\n",
    "\n",
    "    # Wrap the markup in StringIO: newer pandas releases deprecate passing a literal HTML string.\n",
    "    df = pd.read_html(StringIO(response.text))[0]\n",
    "\n",
    "    a_list = html.select(\"#M_More > tr > td.subject > a\")\n",
    "    a_href = [a['href'] for a in a_list]\n",
    "\n",
    "    # '내용링크' means link of speech text; each href is appended to the index URL.\n",
    "    df[\"내용링크\"] = [url + href for href in a_href]\n",
    "\n",
    "    return df\n",
    "\n",
    "def get_content(url):\n",
    "    \"\"\"Download one speech detail page and return the text of its content cell.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the detail page to request.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Text of the first element matching the td.content selector, or the\n",
    "        placeholder '음성 기록 또는 동영상 기록' when nothing matches.\n",
    "    \"\"\"\n",
    "    # The timeout keeps a stalled connection from blocking the whole run.\n",
    "    response = requests.post(url, verify=False, timeout=30)\n",
    "    html = bs(response.text, 'html.parser')\n",
    "    content = html.select(\"#content > div > table > tbody > tr > td.content\")\n",
    "\n",
    "    # Pause after every request; this must run before any return statement to take effect.\n",
    "    time.sleep(0.1)\n",
    "\n",
    "    if len(content) == 0:\n",
    "        return '음성 기록 또는 동영상 기록' #if it is video or voice\n",
    "    return content[0].text\n",
    "\n",
    "# Register tqdm's progress_* methods on pandas objects; progress_map below needs this.\n",
    "tqdm.pandas()\n",
    "\n",
    "# Collect one DataFrame per page; they are concatenated once after the loop\n",
    "# instead of re-copying a growing DataFrame on every iteration.\n",
    "page_frames = []\n",
    "\n",
    "# Scrape pageIndex 0 through 450 (original note: the last page is 449).\n",
    "# NOTE(review): the section heading says pages 1 to 450 -- confirm the intended bounds.\n",
    "for page_no in tqdm(range(0, 451)):\n",
    "    df = page_scrapping(page_no)\n",
    "    page_frames.append(df)\n",
    "\n",
    "all_df = pd.concat(page_frames, ignore_index=True)\n",
    "\n",
    "# Add contents\n",
    "all_df['text'] = all_df['내용링크'].progress_map(get_content)\n",
    "\n",
    "# Drop '내용링크' column\n",
    "all_df = all_df.drop('내용링크', axis=1)\n",
    "\n",
    "# Save to Excel file.\n",
    "# `encoding` is not passed: to_excel deprecated it in pandas 1.5 and rejects it from 2.0 on.\n",
    "all_df.to_excel('president_speech.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Fetch contents directly from the links in df_speech_link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup as bs\n",
    "import pandas as pd\n",
    "import time\n",
    "\n",
    "# Function to get content from a URL\n",
    "def get_content(url):\n",
    "    \"\"\"Download one speech detail page and return the text of its content cell.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the detail page to request.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Text of the first element matching the td.content selector, or the\n",
    "        placeholder '음성 기록 또는 동영상 기록' when nothing matches.\n",
    "    \"\"\"\n",
    "    # The timeout keeps a stalled connection from blocking the whole run.\n",
    "    response = requests.post(url, verify=False, timeout=30)\n",
    "    html = bs(response.text, 'html.parser')\n",
    "    content = html.select(\"#content > div > table > tbody > tr > td.content\")\n",
    "\n",
    "    # Pause after every request; this must run before any return statement to take effect.\n",
    "    time.sleep(0.1)\n",
    "\n",
    "    if len(content) == 0:\n",
    "        return '음성 기록 또는 동영상 기록'\n",
    "    return content[0].text\n",
    "\n",
    "# Load the Excel file\n",
    "file_path = r'   \\df_speech_link.xlsx'\n",
    "df = pd.read_excel(file_path)\n",
    "\n",
    "# Extract content for each link\n",
    "df['Content'] = df['내용링크'].apply(get_content)\n",
    "\n",
    "# Save the DataFrame to a new Excel file with the content.\n",
    "# `encoding` is not passed: to_excel deprecated it in pandas 1.5 and rejects it from 2.0 on.\n",
    "output_file_path = r'   \\df_speech_withcontents.xlsx'\n",
    "df.to_excel(output_file_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
